Code
# Attach all packages used in this analysis.
# NOTE: the original attached dplyr twice; the duplicate call was removed.
# library(tidyverse) also attaches ggplot2 and dplyr; the explicit calls
# are kept for readability.
library(ggplot2)
library(dplyr)
library(lubridate)
library(slider)
library(forecast)
library(fable)
library(tsibble)
library(tidyverse)
library(gridExtra)
library(feasts)
library(prophet)
library(fable.prophet)

# In this section, you should describe the time-series variable you have
# selected for analysis. This should include, but is not necessarily
# limited to:
Identification of the data source and brief description of the “data-generating process” based on visual analysis
Various summary statistics of the data as necessary
Visualizations that describe the behavior/distribution of the time-series
Visualization and discussion of a moving average of the time series
An assessment of seasonality using time-series decomposition and other visualizations as necessary (if seasonality is present)
Initial split of the data into training and test sets - all following analysis conducted on training set
# These packages were already attached at the top of the script; re-attaching
# is a harmless no-op but is redundant. The duplicate library(dplyr) call
# was removed here as well.
library(ggplot2)
library(dplyr)
library(lubridate)
library(slider)
library(forecast)
library(fable)
library(tsibble)
library(tidyverse)
library(gridExtra)
library(feasts)
library(prophet)
library(fable.prophet)

# Read the monthly lumber price series.
# NOTE(review): hard-coded absolute path — consider a project-relative path
# (e.g. here::here("data", "lumber_price.csv")) so the script is portable.
data <- read.csv("/Users/rakeshpallagani/Downloads/lumber_price.csv")
# Build a monthly tsibble: keep the date and price columns and convert the
# date to a yearmonth index.
lumber_tsibble <- data %>%
  select(date, lumber_price) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)

# Train and Test Split: observations before 2020 form the training set;
# the remainder is held out for evaluation.
split_date <- ymd("2020-01-01")
lumber_train <- lumber_tsibble %>%
  filter(as_date(date) < split_date)
lumber_test <- lumber_tsibble %>%
  filter(as_date(date) >= split_date)

# Summary of Lumber Price in the Data
# Summary statistics of the lumber price (computed with base R; the values
# are identical to the original dplyr::summarise pipeline).
price <- data$lumber_price
summary_stats <- data.frame(
  Mean   = mean(price),
  Median = median(price),
  SD     = sd(price),
  Min    = min(price),
  Max    = max(price)
)
print(summary_stats)
#       Mean Median       SD  Min   Max
# 1 120.8754  113.2 86.51993 19.8 581.5
Plot Analysis
# Distribution plots of the lumber price.
# NOTE: the plot objects were renamed from hist/dens/violin/boxplot to p_*
# so they no longer mask base::hist() and graphics::boxplot().
p_hist <- data %>%
  ggplot() +
  ggtitle("Histogram") +
  geom_histogram(aes(lumber_price)) +
  theme_bw()
p_dens <- data %>%
  ggplot() +
  ggtitle("Density Plot") +
  geom_density(aes(lumber_price)) +
  theme_bw()
p_violin <- data %>%
  ggplot() +
  ggtitle("Violin") +
  geom_violin(aes("", lumber_price)) +
  theme_bw()
p_box <- data %>%
  ggplot() +
  ggtitle("Boxplot") +
  geom_boxplot(aes("", lumber_price)) +
  theme_bw()
grid.arrange(p_hist, p_dens, p_violin, p_box, ncol = 2)

# From the histogram and density plot, we can observe that it's a right
# skewed distribution. From the boxplot, it can be seen that there are
# outliers.
Time Series Data Plot
# Line plot of the raw monthly lumber price series.
data_plot <- lumber_tsibble %>%
  ggplot(aes(x = date, y = lumber_price)) +
  geom_line() +
  theme_bw() +
  xlab("Date") +
  ylab("Lumber Price")
print(data_plot)

# Moving average analysis
We compare moving averages of several window sizes to find one that captures the trend of the series without overfitting the noise. The blue line represents the general trend of the data, while the red curve is the 13th-order moving average (ma_13), which tracks the time series most closely.
# Moving-average smoothing: 3-, 7-, and 13-month centered rolling means.
# NOTE: columns are referenced directly inside mutate() instead of via
# lumber_tsibble$lumber_price — the original $-indexing bypassed the piped
# data and would silently break if the pipeline were filtered first.
lumber_ma <- lumber_tsibble %>%
  mutate(
    lumber_ma_03 = zoo::rollmean(lumber_price, k = 3, fill = NA),
    lumber_ma_07 = zoo::rollmean(lumber_price, k = 7, fill = NA),
    lumber_ma_13 = zoo::rollmean(lumber_price, k = 13, fill = NA)
  )

# Long format: one row per (date, MA order) for plotting one line per order.
# The original also attached labels identical to the levels — redundant,
# so only the levels are kept.
lumber_ma_pivot <- lumber_ma %>%
  pivot_longer(
    cols = lumber_ma_03:lumber_ma_13,
    values_to = "value_ma",
    names_to = "ma_order"
  ) %>%
  mutate(ma_order = factor(
    ma_order,
    levels = c("lumber_ma_03", "lumber_ma_07", "lumber_ma_13")
  ))

# Overlay the raw series (black) with each moving average, relabelled for
# the legend.
lumber_ma_pivot %>%
  mutate(ma_order = case_when(
    ma_order == "lumber_ma_03" ~ "03rd Order",
    ma_order == "lumber_ma_07" ~ "07th Order",
    ma_order == "lumber_ma_13" ~ "13th Order"
  )) %>%
  ggplot() +
  geom_line(aes(date, lumber_price), size = 1) +
  geom_line(aes(date, value_ma, color = ma_order), size = 1) +
  scale_color_discrete(name = "MA Order") +
  theme_bw() +
  ylab("Lumber Price") +
  xlab("Time")

# Cumulative sum plots
# Cumulative-sum views of the series: a running total from the start of the
# sample and a rolling 12-month sum, faceted alongside the raw price.
# NOTE(review): .before = 12 with .after = 0 spans 13 observations (the
# current month plus the 12 before it) — confirm this is the intended
# "12 month" window.
lumber_cumsums <- lumber_tsibble %>%
  mutate(
    cumsum = slider::slide_dbl(lumber_price, sum, .before = Inf, .after = 0, .complete = TRUE),
    cumsum_12 = slider::slide_dbl(lumber_price, sum, .before = 12, .after = 0, .complete = TRUE)
  )

lumber_cumsums %>%
  pivot_longer(
    cols = lumber_price:cumsum_12,
    values_to = "lumber_price",
    names_to = "cumsum_order"
  ) %>%
  mutate(cumsum_order = case_when(
    cumsum_order == "lumber_price" ~ "Lumber Price",
    cumsum_order == "cumsum" ~ "Cumulative Sum",
    cumsum_order == "cumsum_12" ~ "Cumulative Sum (12 Months)"
  )) %>%
  ggplot() +
  geom_line(aes(date, lumber_price, color = cumsum_order), size = 1) +
  facet_wrap(~cumsum_order, scales = "free", ncol = 1)

# STL Decomposition
We can see that at the end there could be a minor yearly seasonality with an upward trend
# STL decomposition of the lumber price into trend, seasonal, and remainder
# components.
lumber_tsibble %>%
  model(stl = STL(lumber_price)) %>%
  components() %>%
  autoplot()

# The second section is focused on building an ARIMA model to your
# time-series. This section should include, but is not necessarily
# limited to:
Assessment of whether the data are variance/mean stationary using analysis of the variance over time and statistical tests such as KPSS
Transformations of the data to induce variance/mean stationarity (log/Box-Cox and differencing) or to remove seasonality (seasonal differencing)
Examination of ACF/PACF plots for assessment of potential ARIMA models to fit
Selection of an appropriate ARIMA model based on AIC/BIC
Analysis of model residuals, to include appropriate tests for residual autocorrelation (Ljung-Box)
Variance stationarity check
# Variance stationarity check: 13-month centered rolling standard deviation
# of the training series (6 observations on each side of the current month).
lumber_train %>%
  mutate(rolling_sd = slide_dbl(lumber_price, sd, .before = 6, .after = 6)) %>%
  ggplot(aes(x = date, y = rolling_sd)) +
  geom_line() +
  theme_bw() +
  xlab("Date") +
  ylab("Rolling SD of Lumber Price") +
  ggtitle("Variance over time")

# Applying log transformation to make it variance stationary
# Log-transform the training series to stabilize its variance over time.
train_transformed <- lumber_train %>%
  mutate(value_log = log(lumber_price))

# Plot the transformed series.
train_transformed %>%
  ggplot(aes(date, value_log)) +
  geom_line() +
  theme_bw() +
  ggtitle("Lumber Price over time (Log Transformed)") +
  ylab("Lumber Price transformed") +
  xlab("date")

# Let's do a seasonal difference of 12 lags to remove the minor yearly
# seasonal effect that we observed at the end.
# Seasonal difference at lag 12 of the log series; rows lost to differencing
# are dropped and the result is re-indexed as a tsibble.
train_transformed_12 <- train_transformed %>%
  mutate(value_diff_season = difference(value_log, 12)) %>%
  drop_na() %>%
  as_tsibble(index = date)

# Series, ACF, and PACF of the seasonally differenced data.
train_transformed_12 %>%
  gg_tsdisplay(value_diff_season, plot_type = "partial", lag = 36) +
  labs(title = "Seasonally Differenced", y = "")

# KPSS test for stationarity
# KPSS test on the seasonally differenced log series.
# NOTE: unitroot_kpss truncates p-values at 0.1, so a reported 0.1 means
# p >= 0.1 (i.e. we fail to reject stationarity).
season_value_kpss <- train_transformed_12 %>%
  features(value_diff_season, unitroot_kpss)
season_value_kpss
# # A tibble: 1 × 2
#   kpss_stat kpss_pvalue
#       <dbl>       <dbl>
# 1     0.228         0.1
The above KPSS test confirms that process is stationary as p-value (0.1) > 0.05
ACF/PACF plots
# ACF and PACF of the seasonally differenced series, side by side.
# The graphics state is saved and restored so par(mfrow = c(1, 2)) does not
# leak into later plots (the original left mfrow changed for the session).
old_par <- par(mfrow = c(1, 2))
acf(train_transformed_12$value_diff_season)
pacf(train_transformed_12$value_diff_season)
par(old_par)

# There is a minor yearly seasonality that can be observed from the time
# series decomposition and the rolling standard deviation over time. So, I
# used a log transformation to stabilize the variance. The KPSS test after
# that shows the mean is stationary.
From the above ACF plot we can see that the shorter lag values have higher correlation because of the trend in the data. The correlations taper off slowly as lags increase. The time series appears to be a combination of AR and MA. PACF suggests AR(2). ACF suggests MA(6)
ARIMA model comparison
# Candidate seasonal ARIMA specifications on the log scale. All use first
# ordinary and seasonal differences; orders were chosen around the
# AR(2)/MA(6) hints from the ACF/PACF. Models are compared by BIC.
models <- lumber_train %>%
  model(
    mod1  = ARIMA(log(lumber_price) ~ pdq(2, 1, 6) + PDQ(2, 1, 6) + 1),
    mod2  = ARIMA(log(lumber_price) ~ pdq(2, 1, 5) + PDQ(2, 1, 4) + 1),
    mod3  = ARIMA(log(lumber_price) ~ pdq(2, 1, 3) + PDQ(2, 1, 3) + 1),
    mod4  = ARIMA(log(lumber_price) ~ pdq(1, 1, 3) + PDQ(1, 1, 3) + 1),
    mod5  = ARIMA(log(lumber_price) ~ pdq(1, 1, 4) + PDQ(1, 1, 4) + 1),
    mod6  = ARIMA(log(lumber_price) ~ pdq(2, 1, 6) + PDQ(1, 1, 3) + 1),
    mod7  = ARIMA(log(lumber_price) ~ pdq(2, 1, 4) + PDQ(1, 1, 2) + 1),
    mod8  = ARIMA(log(lumber_price) ~ pdq(2, 1, 4) + PDQ(1, 1, 1) + 1),
    mod9  = ARIMA(log(lumber_price) ~ pdq(2, 1, 3) + PDQ(1, 1, 2) + 1),
    mod10 = ARIMA(log(lumber_price) ~ pdq(2, 1, 2) + PDQ(2, 1, 1) + 1)
  )

# Information criteria for each fitted model, best (lowest BIC) first.
# (Only 9 rows appear below: mod2 failed to fit and is absent.)
models %>%
  glance(mod1, mod2, mod3, mod4, mod5, mod6, mod7, mod8, mod9, mod10) %>%
  arrange(BIC)
# # A tibble: 9 × 8
#   .model   sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots
#   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>
# 1 mod9   0.000590   1970. -3921. -3920. -3873. <cpl [14]> <cpl [27]>
# 2 mod10  0.000601   1961. -3905. -3905. -3862. <cpl [26]> <cpl [14]>
# 3 mod8   0.000600   1962. -3905. -3905. -3857. <cpl [14]> <cpl [16]>
# 4 mod4   0.000601   1962. -3903. -3903. -3856. <cpl [13]> <cpl [39]>
# 5 mod7   0.000599   1963. -3905. -3905. -3853. <cpl [14]> <cpl [28]>
# 6 mod5   0.000600   1964. -3904. -3903. -3846. <cpl [13]> <cpl [52]>
# 7 mod3   0.000602   1962. -3900. -3900. -3843. <cpl [26]> <cpl [39]>
# 8 mod6   0.000601   1964. -3899. -3899. -3832. <cpl [14]> <cpl [42]>
# 9 mod1   0.000590   1967. -3897. -3897. -3812. <cpl [26]> <cpl [78]>
ARIMA(2,1,3)(1,1,2)[12] is the best model
# Let an exhaustive automatic search (no approximation, no stepwise
# shortcuts) pick an ARIMA specification for comparison with the manual
# candidates above.
best_model_auto_arima <- lumber_train %>%
  model(mod1 = ARIMA(log(lumber_price), approximation = FALSE, stepwise = FALSE))

best_model_auto_arima %>%
  report()
# Series: lumber_price
# Model: ARIMA(2,1,1)(2,0,0)[12] w/ drift
# Transformation: log(lumber_price)
# Coefficients:
#          ar1      ar2      ma1    sar1    sar2  constant
#       1.0465  -0.3377  -0.6162  0.0201  0.1001     7e-04
# s.e.  0.1393   0.0545   0.1425  0.0342  0.0347     3e-04
# sigma^2 estimated as 0.0006336:  log likelihood=1983
# AIC=-3952.01   AICc=-3951.88   BIC=-3918.59
ARIMA gives (2,1,1)(2,0,0) which is almost close to our best model intuition from the above iterations.
Observed vs Fitted values for our best model
# Observed vs fitted values for the selected model (mod9).
# NOTE: the plot is built directly from augment()'s output instead of mixing
# lumber_train$... vectors with a separately extracted fitted vector — the
# original relied on the two having identical row order, and its `fitted`
# variable masked stats::fitted().
best_model <- models %>%
  select(mod9)

best_model %>%
  augment() %>%
  ggplot() +
  geom_line(aes(date, lumber_price), color = "black") +
  geom_line(aes(date, .fitted), color = "blue", alpha = 0.5) +
  theme_bw() +
  xlab("Date") +
  ylab("Lumber Price") +
  ggtitle("Observed vs Fitted (in blue) values")

# The in-sample predicted values match the observed values closely.
Residual Analysis
# Residual diagnostics (residual series, ACF, histogram) for the selected
# model. The two statements below were fused onto one line in the original.
best_model %>%
  gg_tsresiduals(lag = 36)

# Ljung-Box test on the innovation residuals.
# NOTE(review): dof = 4 is fewer than the 8 ARMA coefficients of
# ARIMA(2,1,3)(1,1,2) — confirm the intended degrees-of-freedom correction.
best_model %>%
  augment() %>%
  features(.innov, ljung_box, lag = 10, dof = 4)
# # A tibble: 1 × 3
#   .model lb_stat lb_pvalue
#   <chr>    <dbl>     <dbl>
# 1 mod9      9.65     0.140
The Ljung-Box test for residual autocorrelation confirms that there is no significant autocorrelation in the residuals, as the p-value (0.140) > 0.05.
The third section is focused on building a Prophet model to your time-series. This section should include, but is not necessarily limited to:
Choosing a “best” Prophet model based on assessment of:
Decomposition of the elements of the time series (trend, seasonality, etc.) and associated visualizations
Assessment of trend/changepoints, and modification of hyperparameters if necessary (number of changepoints, range, flexibility)
Assessment of whether the model should take into account a saturating minimum/maximum point
Identification and assessment of seasonality (daily, weekly, yearly, as well as additive/multiplicative), to include visualizations, as appropriate
Initial forecast
# Initial Prophet fit on the training data, forecasting 48 months ahead and
# plotted against the full series (train + test).
lumber_train %>%
  model(prophet = fable.prophet::prophet(lumber_price)) %>%
  forecast(h = 48) %>%
  autoplot(lumber_tsibble)

# Decomposition into trend and seasonality
We can see that there is no multiplicative trend, could be an additive trend with chances of yearly seasonality
# Fit the default Prophet model and plot its components (trend, seasonality).
model <- lumber_train %>%
  model(prophet = fable.prophet::prophet(lumber_price))

model %>%
  components() %>%
  autoplot()

# Considering changepoints
# Extract every changepoint Prophet fitted and overlay them (red dashed
# lines) on the training series.
changepoints <- model %>%
  glance() %>%
  pull(changepoints) %>%
  bind_rows() %>%
  pull(changepoints)

lumber_train %>%
  ggplot() +
  geom_line(aes(date, lumber_price)) +
  geom_vline(xintercept = as.Date(changepoints), color = "red", linetype = "dashed")

# There are too many changepoints in consideration, which is not needed.
Hence, optimising changepoints
# Keep only the material changepoints: those whose trend-rate adjustment
# exceeds 0.01 in absolute value.
changepoints_orig <- model %>%
  glance() %>%
  pull(changepoints) %>%
  bind_rows() %>%
  filter(abs(adjustment) > 0.01) %>%
  pull(changepoints)

lumber_train %>%
  ggplot() +
  geom_line(aes(date, lumber_price)) +
  geom_vline(xintercept = as.Date(changepoints_orig), color = "red", linetype = "dashed")

# Identifying the right Prophet model
# Compare the default Prophet fit with two changepoint-flexibility settings
# (changepoint_prior_scale 0.01 = stiff, 0.20 = flexible), each with yearly
# seasonality.
model <- lumber_train %>%
  model(
    prophet_orig = fable.prophet::prophet(lumber_price),
    prophet_1 = fable.prophet::prophet(lumber_price ~ growth(changepoint_prior_scale = 0.01) + season(period = "year")),
    prophet_2 = fable.prophet::prophet(lumber_price ~ growth(changepoint_prior_scale = 0.20) + season(period = "year"))
  )

# Helper: pull the material changepoints (|adjustment| >= 0.01) for one
# named model. Replaces three copy-pasted pipelines from the original; the
# no-op bind_rows() after unnest() was dropped.
extract_changepoints <- function(fitted_models, model_name) {
  fitted_models %>%
    glance() %>%
    unnest(changepoints) %>%
    filter(.model == model_name) %>%
    filter(abs(adjustment) >= 0.01) %>%
    pull(changepoints)
}

changepoints_orig <- extract_changepoints(model, "prophet_orig")
changepoints_1 <- extract_changepoints(model, "prophet_1")
changepoints_2 <- extract_changepoints(model, "prophet_2")

# Forecast 48 months ahead with each specification and mark each model's
# changepoints (blue = default, green = stiff, red = flexible).
model %>%
  forecast(h = 48) %>%
  autoplot(lumber_train, level = NULL) +
  geom_vline(xintercept = as.Date(changepoints_orig), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = as.Date(changepoints_1), color = "green", linetype = "dashed") +
  geom_vline(xintercept = as.Date(changepoints_2), color = "red", linetype = "dashed")

# Saturation Points
# Compare the default Prophet fit against one with explicit capacity/floor.
# NOTE(review): capacity and floor are only honored by logistic growth;
# with type = 'linear' they are presumably ignored — confirm against the
# fable.prophet growth() documentation.
lumber_train %>%
  model(
    prophet_orig = fable.prophet::prophet(lumber_price),
    prophet_saturating = fable.prophet::prophet(lumber_price ~ growth(type = "linear", capacity = 2000, floor = 0) + season("year"))
  ) %>%
  forecast(h = 48) %>%
  autoplot(
    lumber_train %>% filter(as_date(date) >= ymd("2016-01-01")),
    level = NULL
  )

# We need not take into account a saturating minimum/maximum point.
Seasonality
Chance of additive seasonality being present
# Compare additive vs multiplicative yearly seasonality in Prophet.
model <- lumber_train %>%
  model(
    additive = fable.prophet::prophet(lumber_price ~ growth() + season(period = "year", type = "additive")),
    multiplicative = fable.prophet::prophet(lumber_price ~ growth() + season(period = "year", type = "multiplicative"))
  )

model %>%
  components() %>%
  autoplot()

# Additive seasonality
model %>%
  forecast(h = 48) %>%
  autoplot(level = NULL)

# The best model is the original Prophet with 10 changepoints (from the
# Prophet defaults and hyper-parameter configurations).
The final section is focused on assessing and comparing the performance of several models proposed above, to include, at a minimum, one ARIMA model, one Prophet model, and one naive model. You are welcome to compare additional model specifications/hyperparameter combinations during cross-validation as well. This section should include, but is not necessarily limited to:
Selection and implementation of an appropriate rolling window cross-validation scheme on the training set comparing a naive forecast (naive, naive with drift, or seasonal naive), the best ARIMA model from above, and an appropriate Prophet model. Additional model specifications may also be compared.
Compare the performance of the models, and identify which model performs the best at a certain forecasting horizon (e.g. t+1, t+2 etc.)
Assess the performance of the forecast from the best, selected model on the test set and discuss the performance results
Finally, refit the best model to all of the data (including train and test set), and produce a forecast for a meaningful period of time in the future (e.g. 6 months or 1 year). Discuss the forecast and any potential issues with the forecast.
In sample comparison for naive, prophet and arima
#| code-fold: true
#| warning: false
# Rolling-window cross-validation on the training set: start with 72
# observations and grow the window by 72 each fold, forecasting 12 months
# ahead with a seasonal naive model, the selected ARIMA, and a tuned
# Prophet model.
cv_data <- lumber_train %>%
  stretch_tsibble(.init = 72, .step = 72)

cv_forecast <- cv_data %>%
  model(
    naive_w_season = SNAIVE(lumber_price),
    arima_mod = ARIMA(log(lumber_price) ~ pdq(2, 1, 3) + PDQ(1, 1, 2) + 1),
    prophet_mod = fable.prophet::prophet(lumber_price ~ growth(n_changepoints = 10, changepoint_prior_scale = 0.20) + season(period = "year"))
  ) %>%
  forecast(h = 12)

# Plot each fold's forecasts against the observed series.
# NOTE: the join key is now explicit — the original bare left_join relied
# on dplyr guessing the common column.
cv_forecast %>%
  as_tsibble() %>%
  select(-lumber_price) %>%
  left_join(lumber_train, by = "date") %>%
  ggplot() +
  geom_line(aes(date, lumber_price)) +
  geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
  scale_color_discrete(name = "Iteration") +
  ggtitle("SNAIVE vs ARIMA vs Prophet Forecast for Each CV Iteration") +
  ylab("Lumber Price") +
  xlab("Date") +
  theme_bw()

# Accuracy comparison for in sample
# Pooled accuracy of the cross-validated forecasts against the training set.
cv_forecast %>%
  accuracy(lumber_train)
# # A tibble: 3 × 10
#   .model         .type      ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
#   <chr>          <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 arima_mod      Test   0.196   8.67  6.26 1.93   6.08 0.598 0.568 0.835
# 2 naive_w_season Test   0.228  18.6  13.3  2.83  10.3  1.27  1.22  0.887
# 3 prophet_mod    Test   0.0139 19.8  13.2  0.812  9.97 1.26  1.30  0.894
From the metrics above, we can conclude that Arima outperforms naive with seasonality and Prophet model.
RMSE plots
# RMSE by forecast horizon: number the steps within each fold/model, then
# compute accuracy per (h, model) and plot how error grows with horizon.
cv_forecast %>%
  group_by(.id, .model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "lumber_price", distribution = lumber_price) %>%
  accuracy(lumber_train, by = c("h", ".model")) %>%
  ggplot(aes(x = h, y = RMSE, color = .model)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  ggtitle("RMSE of Each Model at Different Intervals") +
  ylab("Average RMSE at Forecasting Intervals") +
  xlab("Months in the Future")

# ARIMA outperforms the seasonal naive and Prophet models at every horizon;
# ARIMA is the best model for forecasting the lumber price.
Out of Sample Forecast
# 48-month out-of-sample forecast from the selected ARIMA model, plotted
# over the combined training and test data.
best_model %>%
  forecast(h = 48) %>%
  autoplot(bind_rows(lumber_train, lumber_test)) +
  ylab("Lumber Price") +
  xlab("Date") +
  ggtitle("Four years Forecast of Lumber Price") +
  theme_bw()

# Final accuracy on test set and interpretation
# Accuracy of the 48-month forecast on the held-out test set.
# NOTE(review): MASE/RMSSE are NaN because accuracy() is given only the test
# set and cannot compute the training-based scaling factor — pass the full
# series if scaled errors are needed.
best_model %>%
  forecast(h = 48) %>%
  accuracy(lumber_test)
# # A tibble: 1 × 10
#   .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
#   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 mod9   Test   98.3  136.  100.  24.4  25.2   NaN   NaN 0.839
The average absolute percentage difference (MAPE) between the forecast and the actual test-set values is about 25%.
Final future forecast
# Final future forecast: refit the chosen ARIMA(2,1,3)(1,1,2)[12] to the
# full data set (train + test) with the forecast package and project six
# months ahead.
# NOTE(review): include.mean has no effect when d > 0 in Arima(); use
# include.drift if a drift term is intended — confirm.
# NOTE(review): the ts start/end dates are hard-coded; deriving them from
# `data` would keep this robust when the CSV is updated.
data$date <- as.Date(data$date)
ts_lumber <- ts(
  data$lumber_price,
  frequency = 12,
  start = c(1947, 1),
  end = c(2023, 11)
)
arima_model <- Arima(
  ts_lumber,
  order = c(2, 1, 3),
  seasonal = list(order = c(1, 1, 2), period = 12),
  include.mean = TRUE
)
forecast_result <- forecast(arima_model, h = 6)
print(forecast_result)
#          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
# Dec 2023       243.4655 229.3352 257.5959 221.8550 265.0760
# Jan 2024       250.2904 227.2178 273.3631 215.0038 285.5771
# Feb 2024       261.3895 232.2363 290.5427 216.8035 305.9755
# Mar 2024       267.6594 235.5307 299.7882 218.5227 316.7961
# Apr 2024       271.8150 237.7019 305.9282 219.6435 323.9866
# May 2024       283.2548 247.5896 318.9199 228.7097 337.7999
plot(forecast_result, main = "6 Time Period Forecast", xlab = "Time", ylab = "Lumber Price")